{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.026049,
     "end_time": "2022-01-20T13:42:48.322998",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.296949",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    " # Augmentations in NLP\n",
    "\n",
    "Data Augmentation techniques in NLP show substantial improvements on datasets with less than 500 observations, as illustrated by the original paper.\n",
    "\n",
    "https://arxiv.org/abs/1901.11196\n",
    "\n",
    "The Paper Considered here is EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_kg_hide-output": true,
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:48.375525Z",
     "iopub.status.busy": "2022-01-20T13:42:48.371750Z",
     "iopub.status.idle": "2022-01-20T13:42:48.384611Z",
     "shell.execute_reply": "2022-01-20T13:42:48.383761Z",
     "shell.execute_reply.started": "2021-10-30T20:27:45.698216Z"
    },
    "papermill": {
     "duration": 0.038939,
     "end_time": "2022-01-20T13:42:48.384789",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.345850",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/kaggle/input/tweet-sentiment-extraction/sample_submission.csv\n",
      "/kaggle/input/tweet-sentiment-extraction/train.csv\n",
      "/kaggle/input/tweet-sentiment-extraction/test.csv\n"
     ]
    }
   ],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here's several helpful packages to load\n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "# Input data files are available in the read-only \"../input/\" directory\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
    "\n",
    "import os\n",
    "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
    "    for filename in filenames:\n",
    "        print(os.path.join(dirname, filename))\n",
    "\n",
    "# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
    "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.023217,
     "end_time": "2022-01-20T13:42:48.433644",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.410427",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "#  ***Simple Data Augmentatons Techniques* are:**\n",
    "1. SR : Synonym Replacement \n",
    "2. RD : Random Deletion\n",
    "3. RS : Random Swap\n",
    "4. RI : Random Insertion\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:48.483430Z",
     "iopub.status.busy": "2022-01-20T13:42:48.482425Z",
     "iopub.status.idle": "2022-01-20T13:42:48.671855Z",
     "shell.execute_reply": "2022-01-20T13:42:48.671164Z",
     "shell.execute_reply.started": "2021-10-30T20:28:39.115622Z"
    },
    "papermill": {
     "duration": 0.215349,
     "end_time": "2022-01-20T13:42:48.671972",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.456623",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:48.723181Z",
     "iopub.status.busy": "2022-01-20T13:42:48.722197Z",
     "iopub.status.idle": "2022-01-20T13:42:48.742206Z",
     "shell.execute_reply": "2022-01-20T13:42:48.742726Z",
     "shell.execute_reply.started": "2021-10-30T20:28:40.860125Z"
    },
    "papermill": {
     "duration": 0.047323,
     "end_time": "2022-01-20T13:42:48.742872",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.695549",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>textID</th>\n",
       "      <th>text</th>\n",
       "      <th>selected_text</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>cb774db0d1</td>\n",
       "      <td>I`d have responded, if I were going</td>\n",
       "      <td>I`d have responded, if I were going</td>\n",
       "      <td>neutral</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>549e992a42</td>\n",
       "      <td>Sooo SAD I will miss you here in San Diego!!!</td>\n",
       "      <td>Sooo SAD</td>\n",
       "      <td>negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>088c60f138</td>\n",
       "      <td>my boss is bullying me...</td>\n",
       "      <td>bullying me</td>\n",
       "      <td>negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9642c003ef</td>\n",
       "      <td>what interview! leave me alone</td>\n",
       "      <td>leave me alone</td>\n",
       "      <td>negative</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>358bd9e861</td>\n",
       "      <td>Sons of ****, why couldn`t they put them on t...</td>\n",
       "      <td>Sons of ****,</td>\n",
       "      <td>negative</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       textID                                               text  \\\n",
       "0  cb774db0d1                I`d have responded, if I were going   \n",
       "1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   \n",
       "2  088c60f138                          my boss is bullying me...   \n",
       "3  9642c003ef                     what interview! leave me alone   \n",
       "4  358bd9e861   Sons of ****, why couldn`t they put them on t...   \n",
       "\n",
       "                         selected_text sentiment  \n",
       "0  I`d have responded, if I were going   neutral  \n",
       "1                             Sooo SAD  negative  \n",
       "2                          bullying me  negative  \n",
       "3                       leave me alone  negative  \n",
       "4                        Sons of ****,  negative  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:48.793883Z",
     "iopub.status.busy": "2022-01-20T13:42:48.792856Z",
     "iopub.status.idle": "2022-01-20T13:42:48.802902Z",
     "shell.execute_reply": "2022-01-20T13:42:48.803522Z",
     "shell.execute_reply.started": "2021-10-30T20:28:45.014361Z"
    },
    "papermill": {
     "duration": 0.037286,
     "end_time": "2022-01-20T13:42:48.803669",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.766383",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "list_to_drop = ['textID','selected_text','sentiment']\n",
    "data.drop(list_to_drop,axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:48.858761Z",
     "iopub.status.busy": "2022-01-20T13:42:48.857176Z",
     "iopub.status.idle": "2022-01-20T13:42:48.869115Z",
     "shell.execute_reply": "2022-01-20T13:42:48.869634Z",
     "shell.execute_reply.started": "2021-10-30T20:28:53.374962Z"
    },
    "papermill": {
     "duration": 0.042016,
     "end_time": "2022-01-20T13:42:48.869790",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.827774",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>I`d have responded, if I were going</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Sooo SAD I will miss you here in San Diego!!!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>my boss is bullying me...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>what interview! leave me alone</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Sons of ****, why couldn`t they put them on t...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text\n",
       "0                I`d have responded, if I were going\n",
       "1      Sooo SAD I will miss you here in San Diego!!!\n",
       "2                          my boss is bullying me...\n",
       "3                     what interview! leave me alone\n",
       "4   Sons of ****, why couldn`t they put them on t..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:48.920753Z",
     "iopub.status.busy": "2022-01-20T13:42:48.920132Z",
     "iopub.status.idle": "2022-01-20T13:42:48.925841Z",
     "shell.execute_reply": "2022-01-20T13:42:48.926345Z"
    },
    "papermill": {
     "duration": 0.032704,
     "end_time": "2022-01-20T13:42:48.926499",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.893795",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of examples to be used is : 27481\n"
     ]
    }
   ],
   "source": [
    "print(f\"Total number of examples to be used is : {len(data)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.023903,
     "end_time": "2022-01-20T13:42:48.974896",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.950993",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# 1. Synonym Replacement :\n",
    "\n",
    "Synonym replacement is a technique in which we replace a word by one of its synonyms\n",
    "\n",
    "For identifying relevent Synonyms we use WordNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-10-30T20:28:57.741387Z",
     "iopub.status.busy": "2021-10-30T20:28:57.740759Z",
     "iopub.status.idle": "2021-10-30T20:28:59.44851Z",
     "shell.execute_reply": "2021-10-30T20:28:59.4473Z",
     "shell.execute_reply.started": "2021-10-30T20:28:57.741353Z"
    },
    "papermill": {
     "duration": 0.02503,
     "end_time": "2022-01-20T13:42:49.024269",
     "exception": false,
     "start_time": "2022-01-20T13:42:48.999239",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.024967,
     "end_time": "2022-01-20T13:42:49.074263",
     "exception": false,
     "start_time": "2022-01-20T13:42:49.049296",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "The get_synonyms funtion will return pre-processed list of synonyms of given word\n",
    "\n",
    "Now we will replace the words with synonyms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "_kg_hide-output": true,
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:49.127362Z",
     "iopub.status.busy": "2022-01-20T13:42:49.126363Z",
     "iopub.status.idle": "2022-01-20T13:42:50.804321Z",
     "shell.execute_reply": "2022-01-20T13:42:50.804955Z",
     "shell.execute_reply.started": "2021-10-30T20:29:04.106201Z"
    },
    "papermill": {
     "duration": 1.70632,
     "end_time": "2022-01-20T13:42:50.805141",
     "exception": false,
     "start_time": "2022-01-20T13:42:49.098821",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = []\n",
    "for w in stopwords.words('english'):\n",
    "    stop_words.append(w)\n",
    "print(stop_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:50.858973Z",
     "iopub.status.busy": "2022-01-20T13:42:50.857974Z",
     "iopub.status.idle": "2022-01-20T13:42:50.867549Z",
     "shell.execute_reply": "2022-01-20T13:42:50.868167Z",
     "shell.execute_reply.started": "2021-10-30T20:29:08.154452Z"
    },
    "papermill": {
     "duration": 0.038282,
     "end_time": "2022-01-20T13:42:50.868321",
     "exception": false,
     "start_time": "2022-01-20T13:42:50.830039",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import random\n",
    "from nltk.corpus import wordnet\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:50.922246Z",
     "iopub.status.busy": "2022-01-20T13:42:50.921239Z",
     "iopub.status.idle": "2022-01-20T13:42:50.928950Z",
     "shell.execute_reply": "2022-01-20T13:42:50.929598Z"
    },
    "papermill": {
     "duration": 0.036512,
     "end_time": "2022-01-20T13:42:50.929749",
     "exception": false,
     "start_time": "2022-01-20T13:42:50.893237",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "\n",
    "def get_synonyms(word):\n",
    "    \n",
    "    synonyms = set()\n",
    "    \n",
    "    for syn in wordnet.synsets(word):\n",
    "        for l in syn.lemmas():\n",
    "            synonym = l.name().replace(\"_\", \" \").replace(\"-\", \" \").lower()\n",
    "            synonym = \"\".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])\n",
    "            synonyms.add(synonym) \n",
    "    if word in synonyms:\n",
    "        synonyms.remove(word)\n",
    "    \n",
    "    return list(synonyms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:50.990343Z",
     "iopub.status.busy": "2022-01-20T13:42:50.989619Z",
     "iopub.status.idle": "2022-01-20T13:42:50.992573Z",
     "shell.execute_reply": "2022-01-20T13:42:50.992063Z",
     "shell.execute_reply.started": "2021-10-30T20:29:12.301235Z"
    },
    "papermill": {
     "duration": 0.037801,
     "end_time": "2022-01-20T13:42:50.992688",
     "exception": false,
     "start_time": "2022-01-20T13:42:50.954887",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def synonym_replacement(words, n):    \n",
    "    words = words.split()    \n",
    "    new_words = words.copy()\n",
    "    random_word_list = list(set([word for word in words if word not in stop_words]))\n",
    "    random.shuffle(random_word_list)\n",
    "    num_replaced = 0\n",
    "    \n",
    "    for random_word in random_word_list:\n",
    "        synonyms = get_synonyms(random_word)\n",
    "        \n",
    "        if len(synonyms) >= 1:\n",
    "            synonym = random.choice(list(synonyms))\n",
    "            new_words = [synonym if word == random_word else word for word in new_words]\n",
    "            num_replaced += 1\n",
    "        \n",
    "        if num_replaced >= n: #only replace up to n words\n",
    "            break\n",
    "    sentence = ' '.join(new_words)\n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:51.048587Z",
     "iopub.status.busy": "2022-01-20T13:42:51.047913Z",
     "iopub.status.idle": "2022-01-20T13:42:53.248433Z",
     "shell.execute_reply": "2022-01-20T13:42:53.249126Z",
     "shell.execute_reply.started": "2021-10-30T20:30:20.915417Z"
    },
    "papermill": {
     "duration": 2.231642,
     "end_time": "2022-01-20T13:42:53.249329",
     "exception": false,
     "start_time": "2022-01-20T13:42:51.017687",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Example of Synonym Replacement: The spry brown university fox jumpstart over the lazy detent\n"
     ]
    }
   ],
   "source": [
    "print(f\" Example of Synonym Replacement: {synonym_replacement('The quick brown fox jumps over the lazy dog',4)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.025037,
     "end_time": "2022-01-20T13:42:53.300284",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.275247",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "To Get Larger Diversity of Sentences we could try replacing 1,2 3, .. Words in the given sentence.\n",
    "\n",
    "Now lets get an example from out dataset and try augmenting it so that we could create 3 additional sentences per tweet "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:53.355887Z",
     "iopub.status.busy": "2022-01-20T13:42:53.354846Z",
     "iopub.status.idle": "2022-01-20T13:42:53.363428Z",
     "shell.execute_reply": "2022-01-20T13:42:53.362892Z",
     "shell.execute_reply.started": "2021-10-30T20:30:39.213323Z"
    },
    "papermill": {
     "duration": 0.037159,
     "end_time": "2022-01-20T13:42:53.363547",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.326388",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the free fillin` app on my ipod is fun, im addicted\n"
     ]
    }
   ],
   "source": [
    "trial_sent = data['text'][25]\n",
    "print(trial_sent)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:53.419866Z",
     "iopub.status.busy": "2022-01-20T13:42:53.419144Z",
     "iopub.status.idle": "2022-01-20T13:42:53.442262Z",
     "shell.execute_reply": "2022-01-20T13:42:53.441671Z",
     "shell.execute_reply.started": "2021-10-30T20:30:42.115145Z"
    },
    "papermill": {
     "duration": 0.053023,
     "end_time": "2022-01-20T13:42:53.442379",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.389356",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Example of Synonym Replacement: the free fillin` app on my ipod is fun, im addict\n",
      " Example of Synonym Replacement: the innocent fillin` app on my ipod is fun, im addicted\n",
      " Example of Synonym Replacement: the relinquish fillin` app on my ipod is fun, im addict\n"
     ]
    }
   ],
   "source": [
    "# Create 3 Augmented Sentences per data \n",
    "\n",
    "for n in range(3):\n",
    "    print(f\" Example of Synonym Replacement: {synonym_replacement(trial_sent,n)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.025845,
     "end_time": "2022-01-20T13:42:53.494461",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.468616",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Now we are able to augment this Data :)\n",
    "\n",
    "You can create New colums for the Same text-id  in our tweet - sentiment Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.026065,
     "end_time": "2022-01-20T13:42:53.546762",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.520697",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# 2.Random Deletion (RD)\n",
    "\n",
    "In Random Deletion, we randomly delete a word if a uniformly generated number between 0 and 1 is smaller than a pre-defined threshold. This allows for a random deletion of some words of the sentence.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:53.609176Z",
     "iopub.status.busy": "2022-01-20T13:42:53.608134Z",
     "iopub.status.idle": "2022-01-20T13:42:53.610488Z",
     "shell.execute_reply": "2022-01-20T13:42:53.610937Z",
     "shell.execute_reply.started": "2021-10-30T20:30:45.075507Z"
    },
    "papermill": {
     "duration": 0.037489,
     "end_time": "2022-01-20T13:42:53.611097",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.573608",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def random_deletion(words, p):\n",
    "\n",
    "    words = words.split()\n",
    "    \n",
    "    #obviously, if there's only one word, don't delete it\n",
    "    if len(words) == 1:\n",
    "        return words\n",
    "\n",
    "    #randomly delete words with probability p\n",
    "    new_words = []\n",
    "    for word in words:\n",
    "        r = random.uniform(0, 1)\n",
    "        if r > p:\n",
    "            new_words.append(word)\n",
    "\n",
    "    #if you end up deleting all words, just return a random word\n",
    "    if len(new_words) == 0:\n",
    "        rand_int = random.randint(0, len(words)-1)\n",
    "        return [words[rand_int]]\n",
    "\n",
    "    sentence = ' '.join(new_words)\n",
    "    \n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.025516,
     "end_time": "2022-01-20T13:42:53.662613",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.637097",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Lets test out this Augmentation with our test_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:53.720829Z",
     "iopub.status.busy": "2022-01-20T13:42:53.720180Z",
     "iopub.status.idle": "2022-01-20T13:42:53.724102Z",
     "shell.execute_reply": "2022-01-20T13:42:53.724740Z",
     "shell.execute_reply.started": "2021-10-30T20:30:49.749246Z"
    },
    "papermill": {
     "duration": 0.036255,
     "end_time": "2022-01-20T13:42:53.724883",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.688628",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the free fillin` app on my is fun, addicted\n",
      "free fillin` app on my ipod is im addicted\n",
      "the free on my ipod is fun, im\n"
     ]
    }
   ],
   "source": [
    "print(random_deletion(trial_sent,0.2))\n",
    "print(random_deletion(trial_sent,0.3))\n",
    "print(random_deletion(trial_sent,0.4))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.025912,
     "end_time": "2022-01-20T13:42:53.777171",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.751259",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "This Could help us in reducing Overfitting and may help to imporve our Model Accuracy "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.025805,
     "end_time": "2022-01-20T13:42:53.829355",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.803550",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "\n",
    "# 3. Random Swap (RS)\n",
    "\n",
    "In Random Swap, we randomly swap the order of two words in a sentence.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:53.885998Z",
     "iopub.status.busy": "2022-01-20T13:42:53.885374Z",
     "iopub.status.idle": "2022-01-20T13:42:53.892176Z",
     "shell.execute_reply": "2022-01-20T13:42:53.892768Z",
     "shell.execute_reply.started": "2021-10-30T20:30:52.542732Z"
    },
    "papermill": {
     "duration": 0.037349,
     "end_time": "2022-01-20T13:42:53.892924",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.855575",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def swap_word(new_words):    \n",
    "    random_idx_1 = random.randint(0, len(new_words)-1)\n",
    "    random_idx_2 = random_idx_1\n",
    "    counter = 0    \n",
    "    while random_idx_2 == random_idx_1:\n",
    "        random_idx_2 = random.randint(0, len(new_words)-1)\n",
    "        counter += 1        \n",
    "        if counter > 3:\n",
    "            return new_words\n",
    "    \n",
    "    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] \n",
    "    return new_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:53.949331Z",
     "iopub.status.busy": "2022-01-20T13:42:53.948625Z",
     "iopub.status.idle": "2022-01-20T13:42:53.954549Z",
     "shell.execute_reply": "2022-01-20T13:42:53.955130Z"
    },
    "papermill": {
     "duration": 0.0358,
     "end_time": "2022-01-20T13:42:53.955283",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.919483",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def random_swap(words, n):    \n",
    "    words = words.split()\n",
    "    new_words = words.copy()\n",
    "    # n is the number of words to be swapped\n",
    "    for _ in range(n):\n",
    "        new_words = swap_word(new_words)\n",
    "        \n",
    "    sentence = ' '.join(new_words)    \n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:54.012526Z",
     "iopub.status.busy": "2022-01-20T13:42:54.011866Z",
     "iopub.status.idle": "2022-01-20T13:42:54.019318Z",
     "shell.execute_reply": "2022-01-20T13:42:54.018553Z"
    },
    "papermill": {
     "duration": 0.037555,
     "end_time": "2022-01-20T13:42:54.019479",
     "exception": false,
     "start_time": "2022-01-20T13:42:53.981924",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the free addicted app on my ipod is fun, im fillin`\n",
      "fun, free fillin` app on my ipod is im the addicted\n",
      "free app fillin` the on addicted ipod is fun, im my\n"
     ]
    }
   ],
   "source": [
    "print(random_swap(trial_sent,1))\n",
    "print(random_swap(trial_sent,2))\n",
    "print(random_swap(trial_sent,3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.027264,
     "end_time": "2022-01-20T13:42:54.074453",
     "exception": false,
     "start_time": "2022-01-20T13:42:54.047189",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "This Random Swapping will help to make our models robust and may inturn help in text classification. \n",
    "\n",
    "High order of swapping may downgrade the model\n",
    "\n",
    "There is a high chance to loose semantics of language so be careful while using this augmentaion.\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.026613,
     "end_time": "2022-01-20T13:42:54.128375",
     "exception": false,
     "start_time": "2022-01-20T13:42:54.101762",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# 4. Random Insertion (RI)\n",
    "Finally, in Random Insertion, we randomly insert synonyms of a word at a random position.\n",
    "\n",
    "Data augmentation\n",
    "operations should not change the true label of\n",
    "a sentence, as that would introduce unnecessary\n",
    "noise into the data. Inserting a synonym of a word\n",
    "in a sentence, opposed to a random word, is more\n",
    "likely to be relevant to the context and retain the\n",
    "original label of the sentence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:54.185704Z",
     "iopub.status.busy": "2022-01-20T13:42:54.185007Z",
     "iopub.status.idle": "2022-01-20T13:42:54.194800Z",
     "shell.execute_reply": "2022-01-20T13:42:54.195378Z"
    },
    "papermill": {
     "duration": 0.040002,
     "end_time": "2022-01-20T13:42:54.195536",
     "exception": false,
     "start_time": "2022-01-20T13:42:54.155534",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def random_insertion(words, n):    \n",
    "    words = words.split()\n",
    "    new_words = words.copy()    \n",
    "    for _ in range(n):\n",
    "        add_word(new_words)        \n",
    "    sentence = ' '.join(new_words)\n",
    "    return sentence\n",
    "\n",
    "def add_word(new_words):    \n",
    "    synonyms = []\n",
    "    counter = 0\n",
    "    \n",
    "    while len(synonyms) < 1:\n",
    "        random_word = new_words[random.randint(0, len(new_words)-1)]\n",
    "        synonyms = get_synonyms(random_word)\n",
    "        counter += 1\n",
    "        if counter >= 10:\n",
    "            return        \n",
    "    random_synonym = synonyms[0]\n",
    "    random_idx = random.randint(0, len(new_words)-1)\n",
    "    new_words.insert(random_idx, random_synonym)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:54.253628Z",
     "iopub.status.busy": "2022-01-20T13:42:54.252968Z",
     "iopub.status.idle": "2022-01-20T13:42:54.261794Z",
     "shell.execute_reply": "2022-01-20T13:42:54.262332Z"
    },
    "papermill": {
     "duration": 0.039465,
     "end_time": "2022-01-20T13:42:54.262479",
     "exception": false,
     "start_time": "2022-01-20T13:42:54.223014",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the free fillin` app on my addict ipod is fun, im addicted\n",
      "the complimentary free fillin` app on my ipod along is fun, im addicted\n",
      "the free along fillin` app addict on my ipod along is fun, im addicted\n"
     ]
    }
   ],
   "source": [
    "print(random_insertion(trial_sent,1))\n",
    "print(random_insertion(trial_sent,2))\n",
    "print(random_insertion(trial_sent,3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:54.321738Z",
     "iopub.status.busy": "2022-01-20T13:42:54.321012Z",
     "iopub.status.idle": "2022-01-20T13:42:54.325880Z",
     "shell.execute_reply": "2022-01-20T13:42:54.326405Z"
    },
    "papermill": {
     "duration": 0.036453,
     "end_time": "2022-01-20T13:42:54.326548",
     "exception": false,
     "start_time": "2022-01-20T13:42:54.290095",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def aug(sent,n,p):\n",
    "    print(f\" Original Sentence : {sent}\")\n",
    "    print(f\" SR Augmented Sentence : {synonym_replacement(sent,n)}\")\n",
    "    print(f\" RD Augmented Sentence : {random_deletion(sent,p)}\")\n",
    "    print(f\" RS Augmented Sentence : {random_swap(sent,n)}\")\n",
    "    print(f\" RI Augmented Sentence : {random_insertion(sent,n)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-01-20T13:42:54.387083Z",
     "iopub.status.busy": "2022-01-20T13:42:54.386400Z",
     "iopub.status.idle": "2022-01-20T13:42:54.402037Z",
     "shell.execute_reply": "2022-01-20T13:42:54.401238Z"
    },
    "papermill": {
     "duration": 0.046853,
     "end_time": "2022-01-20T13:42:54.402403",
     "exception": false,
     "start_time": "2022-01-20T13:42:54.355550",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Original Sentence : the free fillin` app on my ipod is fun, im addicted\n",
      " SR Augmented Sentence : the disembarrass fillin` app on my ipod is fun, im hook\n",
      " RD Augmented Sentence : the free app on my ipod fun, im addicted\n",
      " RS Augmented Sentence : on free fillin` ipod is my the app fun, im addicted\n",
      " RI Augmented Sentence : the free fillin` app on gratis addict my ipod is complimentary make up fun, im addicted\n"
     ]
    }
   ],
   "source": [
    "aug(trial_sent,4,0.3)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "papermill": {
   "duration": 10.912265,
   "end_time": "2022-01-20T13:42:54.539428",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2022-01-20T13:42:43.627163",
   "version": "2.1.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
